% Program using a maximum likelihood estimator for binary
% outcome model
% g(E(Y) | V) = beta' V
% for Y_i binary: 0 or 1
% The explanatory variables V (of which there are m) are either binary or
% continuous; think of V as a m-dimensional vector for each data point.
% Given n data points, the idea is to find the best estimate for the
% m-dimensional coefficient vector beta, using a maximum likelihood method
%
%
% First, the program generates synthetic data
%
% The program then implements a Newton-Raphson scheme to compute
% the maximum likelihood beta, based on the modification of the weighting
% function from possessing singularities when the probabilities are close
% to 0 or 1, to the modified (SAMLE) version
%
% Calculations are done after making a choice of the number of binary
% variables, and then over a range of the number of continuous variables
%
% Sanjeeva Balasuriya
% January, 2023
%



clear all; close all; clear figures
warning('off','all'); warning

% Setting up parameters and options
%
n = 800; % number of data points (individuals) % 800
mb = 0; % number of binary explanatory variables % 0, 2
maxmc = 20; % maximum number of continuous explanatory variables (ranges from 0 to this value)
whether_randomize = 'n'; % whether to add more randomness in generating 
                         % data by flipping a proportion of the y-values
whether_write_data = 'n'; % whether to write data to file
whether_use_logistic = 'y'; % whether to compare with explicit logistic Newton-Raphson
maxNR = 100; % maximum number of iterations for Newton-Raphson % 500
decimal_places_beta = 4; % aim to get beta to this number of decimals % 4
epsNR = 10^(-decimal_places_beta-2); % stop N-R when difference in betas is below this value
method = "logit"; % (choose "atanh" or "log" or "logit" or "linear" or "probit" -- these are the link functions) % logit
n_realizations = 50; % number of initial guesses to choose for beta % 5000


% Choose mb (seeking to do computations for a range of mc)
%
% initialize
if mb==0 % if mb = 0, must start mc from 1, but if mb>0, start from 0
    startmc = 1;
    mcvector  = NaN([maxmc,1]);
else
    startmc = 0;
    mcvector = NaN([maxmc+1,1]);
end
good_proportion_vector = NaN([maxmc+1-startmc,1]);
bad_proportion_vector = NaN([maxmc+1-startmc,1]);
if strcmp(whether_use_logistic,"y")
    l_proportion_vector = NaN([maxmc+1-startmc,1]);
end

% Loop over all mc values
%
for mc = startmc : maxmc 
    m = mb+mc;
    mcvector(mc+1-startmc) = mc;
    
    % define functions
    [g,G,Gprime,Gprimeprime,G_generate,weight_function_g,weight_function_b,sup_norm] = ...
        functions_define(method);
    
    % Generate data
    [v,y] = generate_data(n,mb,mc,G_generate,whether_randomize,whether_write_data);
    
    % Do computations (good/SAMLE method) for different initial guesses for beta
    %
    n_iterations = zeros(n_realizations,1);
    beta_final = zeros(n_realizations,m);
    %
    for realization = 1: n_realizations
        % Initialize
        rng shuffle  % so that a different initial beta is used each time
        beta_guess(realization,1:m)= randn(m,1);
        beta_old = beta_guess(realization,:);
        % Call Newton-Raphson algorithm with good/SAMLE weighting function
        [beta_new,r] = newton_raphson(epsNR,maxNR,n,m,G,Gprime,Gprimeprime,...
            weight_function_g,sup_norm,beta_old,y,v);
        % Record data
        n_iterations(realization) = r-1;
        beta_final(realization,:) = beta_new;        
    end
    
    % Find proportion which converged to best beta value
    beta_rounded = round(beta_final,decimal_places_beta-1);
    best_beta = mode(beta_rounded);
    good_betas = ismember(beta_rounded,best_beta,'rows');
    good_proportion = sum(good_betas)/n_realizations;
    good_proportion_vector(mc+1-startmc) = good_proportion;
    
    
    
    
    % Do computations (bad/standard method) for different initial guesses for beta
    %
    n_iterations_b = zeros(n_realizations,1);
    beta_final_b = zeros(n_realizations,m);
    %
    for realization = 1: n_realizations
        % Initialize
        rng shuffle  % so that a different initial beta is used each time
        beta_guess(realization,:)= randn(m,1);
        beta_old = beta_guess(realization,:);
        % Call Newton-Raphson algorithm with bad/standard weighting function
        [beta_new,r] = newton_raphson(epsNR,maxNR,n,m,G,Gprime,Gprimeprime,...
            weight_function_b,sup_norm,beta_old,y,v);
        % Record data
        n_iterations_b(realization) = r-1;
        beta_final_b(realization,:) = beta_new;
        
    end
    
    % Find proportion which converged to best beta value
    beta_rounded_b = round(beta_final_b,decimal_places_beta-1);
    best_beta_b = mode(beta_rounded_b);
    good_betas_b = ismember(beta_rounded_b,best_beta_b,'rows');
    bad_proportion = sum(good_betas_b)/n_realizations;
    bad_proportion_vector(mc+1-startmc) = bad_proportion;
    
    % If an explicit Newton-Raphson written for the logistic case is to be also
    % used and compared
    if strcmp(whether_use_logistic,"y")
        n_iterations_l = zeros(n_realizations,1);
        beta_final_l = zeros(n_realizations,m);
        %
        for realization = 1: n_realizations
            % Initialize
            rng shuffle  % so that a different initial beta is used each time
            beta_guess(realization,:)= randn(m,1);
            beta_old = beta_guess(realization,:);
            % Call Newton-Raphson algorithm explictly for logistic
        	[beta_new,r] = newton_raphson_logistic(epsNR,maxNR,n,m,...
                sup_norm,beta_old,y,v);
            % Record data
            n_iterations_l(realization) = r-1;
            beta_final_l(realization,:) = beta_new;
        
        end
        % Find proportion which converged to best beta value
        beta_rounded_l = round(beta_final_l,decimal_places_beta-1);
        best_beta_l = mode(beta_rounded_l);
        good_betas_l = ismember(beta_rounded_l,best_beta_l,'rows');
        l_proportion = sum(good_betas_l)/n_realizations;
        l_proportion_vector(mc+1-startmc) = l_proportion;
    end

    
end




% plot proportions versus mc to compare good/SAMLE with bad/standard
%
figure(10)
plot(mcvector,good_proportion_vector,'ko','MarkerSize',7)
xlabel('Number of continuous variables')
ylabel('Convergent proportion')
pbaspect([2 1 1])
hold on
ylim([-0.1 1.1])
plot(mcvector,bad_proportion_vector,'kx','MarkerSize',9)
if strcmp(whether_use_logistic,"y")
    plot(mcvector,l_proportion_vector,'k.','MarkerSize',11)
end
ax = gca; ax.FontSize = 16;
%figfilename = strcat("mb",num2str(mb),"_nsim",num2str(n_realizations));
%saveas(10,figfilename,'fig');
%saveas(10,figfilename,'eps');





